This report shows the QC steps for gene expression microarry data from GEO study, including:

Change the variables here

# GEO id
geo_id="GSE4917"
# direcotry stores GEO data and phenotype file
datadir="C:/Users/mengykan/Projects/AsthmaApp/data/microarray"
resdir="C:/Users/mengykan/Projects/AsthmaApp/res/microarray"

Install the prerequisite R packages

source("http://bioconductor.org/biocLite.R")
biocLite("GEOquery")
biocLite("oligo")
biocLite("affy") # for Affymetrix microarray-specific QC analysis
biocLite("viridis")
install.packages("ggplot2")
install.packages("gplots")
install.packages("Hmisc")
install.packages("devtools")
install.package("dplyr")
install.package("pander")

Load the necessary libraries

library(GEOquery)
library(oligo)
library(viridis) # heatmap colour
library(ggplot2)
library(gplots) # heatmap.2 plot
library(Hmisc) # compute hoeffd (Hoeffding's D statistics) for MA plot
library(devtools) # compuate PCs
library(pander)
library(dplyr)

Obtain and Inspect Raw Phenotype Data from GEO

This step needs inspection as phenotype data provided by researchers are different. In general, the information of sample id, sample geo id, tissue, disease status, treatment status (if applied) is required. Addition information includes age and gender.

Load a GDS file with GEOquery. Download from GEO server if the data does not exist

If ths study was performed in multiple platforms, choose the samples using Affymetrix Human Genome U133 (Plus 2.0) Array

# check if GEO matrix file is existed
geo_fn <- list.files(path=datadir)[grepl(geo_id,list.files(path=datadir))&grepl("matrix.txt.gz$",list.files(path=datadir))]
if (length(geo_fn)==0) { # matrix files are alreadly downloaded
  gselms <- getGEO(geo_id, destdir=datadir,GSEMatrix = TRUE) # dowanload matrix file
  print(attr(gselms, "names"))
  if (length(gselms)>1) {  # multiple platform
    message("This study was performed in multiple platforms")
    idx <- grep("GLP96|GPL570", attr(gselms, "names")) # only use data from GPL96
    gse <- gselms[[idx]]
  } else {gse <- gselms[[1]]}
} else if (length(geo_fn)==1) {
  print(geo_fn)
  gse <- getGEO(filename=paste0(datadir,"/",geo_fn),GSEMatrix = TRUE)
} else { # multiple platform
  print(geo_fn)
  message("This study was performed in multiple platforms")
  geo_fn <- geo_fn[grep("GLP96|GPL570",geo_fn)]
  gse <- getGEO(filename=paste0(datadir,"/",geo_fn),GSEMatrix = TRUE)
}
## [1] "GSE4917_series_matrix.txt.gz"

Primary Check for phenotype information.

pheno.df <- pData(phenoData(gse))
names(pheno.df)
##  [1] "title"                   "geo_accession"          
##  [3] "status"                  "submission_date"        
##  [5] "last_update_date"        "type"                   
##  [7] "channel_count"           "source_name_ch1"        
##  [9] "organism_ch1"            "characteristics_ch1"    
## [11] "characteristics_ch1.1"   "treatment_protocol_ch1" 
## [13] "molecule_ch1"            "extract_protocol_ch1"   
## [15] "label_ch1"               "label_protocol_ch1"     
## [17] "taxid_ch1"               "hyb_protocol"           
## [19] "scan_protocol"           "description"            
## [21] "data_processing"         "platform_id"            
## [23] "contact_name"            "contact_email"          
## [25] "contact_phone"           "contact_laboratory"     
## [27] "contact_department"      "contact_institute"      
## [29] "contact_address"         "contact_city"           
## [31] "contact_state"           "contact_zip/postal_code"
## [33] "contact_country"         "supplementary_file"     
## [35] "data_row_count"
dim(pheno.df)
## [1] 24 35
rm(gse) # remove this object as it will not be used

Show all the phenotype data from GEO study. If there are more than 10 levels of variables in a column, only show the first five levels of variables.

for (i in 1:ncol(pheno.df)) {
  if (length(levels(pheno.df[,i]))>=10) {
    res <- pheno.df[pheno.df[,i]%in%levels(pheno.df[,i])[1:5],]
    res[,i] <- factor(res[,i])
  } else {res <- pheno.df}
  res <- data.frame(table(res[,i]))
  names(res) <- c(names(pheno.df)[i],"counts")
  pandoc.table(res,justify = 'left',split.table = Inf)
}
title counts
Control(ethanol)-treated sample at 24hr, biological rep1 1
Control(ethanol)-treated sample at 24hr, biological rep2 1
Control(ethanol)-treated sample at 24hr, biological rep3 1
Control(ethanol)-treated sample at 2hr, biological rep1 1
Control(ethanol)-treated sample at 2hr, biological rep2 1
geo_accession counts
GSM109204 1
GSM109205 1
GSM109206 1
GSM109207 1
GSM109208 1
status counts
Public on Jun 02 2006 24
submission_date counts
May 17 2006 24
last_update_date counts
Apr 24 2013 24
type counts
RNA 24
channel_count counts
1 24
source_name_ch1 counts
MCF10A-Myc cells treated with dexamethasone for 24hr 3
MCF10A-Myc cells treated with dexamethasone for 2hr 3
MCF10A-Myc cells treated with dexamethasone for 30 min 2
MCF10A-Myc cells treated with dexamethasone for 30min 1
MCF10A-Myc cells treated with dexamethasone for 4hr 3
organism_ch1 counts
Homo sapiens 24
characteristics_ch1 counts
Genotype: unknown 24
characteristics_ch1.1 counts
Age: 35 24
treatment_protocol_ch1 counts
MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone (Dex) for 24hr. 1
MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 2 hr. 2
MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 24 hr. 2
MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 2hr. 1
MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 30 min. 3
molecule_ch1 counts
total RNA 24
extract_protocol_ch1 counts
Total RNA from each sample was extracted using Qiagenâ RNeasy Kit. 24 s
label_ch1 counts
biotin 24
label_protocol_ch1 counts
Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 6 microg total RNA (Expression Analysis Technical Manual, 2001, Affymetrix). 24
taxid_ch1 counts
9606 24
hyb_protocol counts
Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on Affymetrix human genome HG-U133A genechip. GeneChips were washed and stained in the Affymetrix Fluidics Station 450. 24
scan_protocol counts
GeneChips were scanned using the Affymetrix GeneChip® Scanner 3000 7G. 19
GeneChips were scanned using the Affymetrix GeneChip® Scanner 3000 7G.. 5
description counts
Gene expression data from embryos in slow phase of cellularisation. 1
Gene expression data from MCF10A-Myc cells treated with dex for 2 hr. 1
Gene expression data from MCF10A-Myc cells treated with Dex for 2 hr. 1
Gene expression data from MCF10A-Myc cells treated with dex for 24 hr. 1
Gene expression data from MCF10A-Myc cells treated with Dex for 24 hr. 1
data_processing counts
The data were analyzed and normalized Robust Multi-array Average (RMA) algorithm. 24
platform_id counts
GPL96 24
contact_name counts
Suzanne,Daniela,Conzen 24
contact_email counts
sconzen@medicine.bsd.uchicago.edu 24
contact_phone counts
(773)834-2604 24
contact_laboratory counts
Conzen Lab 24
contact_department counts
Medicine 24
contact_institute counts
The University of Chicago 24
contact_address counts
5841 S. Maryland Ave, SBRI J301, MC 2115 24
contact_city counts
Chicago 24
contact_state counts
IL 24
contact_zip/postal_code counts
60637 24
contact_country counts
USA 24
supplementary_file counts
ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM109nnn/GSM109204/suppl/GSM109204.cel.gz 1
ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM109nnn/GSM109205/suppl/GSM109205.cel.gz 1
ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM109nnn/GSM109206/suppl/GSM109206.cel.gz 1
ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM109nnn/GSM109207/suppl/GSM109207.cel.gz 1
ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM109nnn/GSM109208/suppl/GSM109208.cel.gz 1
data_row_count counts
22215 24

Select required phenotype variables from the primary check and generate standardized columns. This is a manual step.

cols <- c("title","geo_accession","source_name_ch1")
pheno.new <- pheno.df %>%
  dplyr::select(cols) %>%
  dplyr::mutate(GEO_ID=geo_accession) %>%
  dplyr::mutate(Subject=gsub("^.*biological (.*)$","\\1",title)) %>%
  dplyr::mutate(Sample=paste(geo_accession,Subject,sep="_")) %>%
  dplyr::mutate(Tissue="MCF10A-Myc") %>%
  dplyr::mutate(treatment_time=gsub("^.*for (\\d+.*)$","\\1",source_name_ch1)) %>%
  dplyr::mutate(treatment_time=gsub(" ","",treatment_time)) %>%
  dplyr::mutate(treatment_drug=ifelse(grepl("dexamethasone",source_name_ch1),"dex","baseline")) %>%
  dplyr::mutate(Treatment=paste(treatment_drug,treatment_time,sep="_")) %>%
  dplyr::mutate(Age="35") %>%
  dplyr::mutate_if(is.character,as.factor)
pheno.new <- pheno.new[,(length(cols)+1):ncol(pheno.new)]
write.table(pheno.new,paste0(datadir,"/",geo_id,"_Phenotype_withoutQC.txt"),col.names=T,row.names=F,sep="\t",quote=F)
rm(pheno.new)

Check phenotype table and the sample size for different groups

# read in the phenotype file if it was created before
pheno <-read.table(paste0(datadir,"/",geo_id,"_Phenotype_withoutQC.txt"),header=T, sep="\t")
pandoc.table(pheno[1:5,],split.table=Inf) # show first 5 rows
GEO_ID Subject Sample Tissue treatment_time treatment_drug Treatment Age
GSM109204 rep1 GSM109204_rep1 MCF10A-Myc 30min baseline baseline_30min 35
GSM109205 rep1 GSM109205_rep1 MCF10A-Myc 30min dex dex_30min 35
GSM109206 rep1 GSM109206_rep1 MCF10A-Myc 2hr baseline baseline_2hr 35
GSM109207 rep1 GSM109207_rep1 MCF10A-Myc 2hr dex dex_2hr 35
GSM109208 rep1 GSM109208_rep1 MCF10A-Myc 4hr baseline baseline_4hr 35
avail_group=c("Tissue","Disease","Treatment")[c("Tissue","Disease","Treatment")%in%names(pheno)]
res=as.data.frame(table(pheno[,avail_group]))
names(res) <- c(avail_group,"Frequency")
pandoc.table(res,split.table=Inf)
Tissue Treatment Frequency
MCF10A-Myc baseline_24hr 3
MCF10A-Myc baseline_2hr 3
MCF10A-Myc baseline_30min 3
MCF10A-Myc baseline_4hr 3
MCF10A-Myc dex_24hr 3
MCF10A-Myc dex_2hr 3
MCF10A-Myc dex_30min 3
MCF10A-Myc dex_4hr 3

Quality Control for Microarray Data

Quality control includes:

The general QC steps and outlier detection methods are derived from arrayQualityMetrics

Detailed description of QC steps for microarray data can be found here

Obtain Gene Expression Raw Data

Download raw data files (.cel) if the data folder does not exist.

celfiles <- paste0(datadir,"/",geo_id,"/data/",gsub("^.*/(.*)$","\\1",pheno.df$supplementary_file)) # get .cel file names to read in. This step is important for samples scanned in different platforms
if (!all(celfiles%in%list.files(path=paste0(datadir,"/",geo_id,"/data"),pattern=".CEL.gz|.cel.gz",full.names=T))) { # if cel files are not downloaded or not all the sample cel files exsist
  getGEOSuppFiles(geo_id,baseDir=datadir) #download GEO files
  untar(paste0(datadir,"/",geo_id,"/",geo_id,"_RAW.tar"), exdir=paste0(datadir,"/",geo_id,"/data")) # extract the zip file
}

Read in raw data. Generate raw data object under ExpressionFeatureSet (oligo class).

raw.data <- read.celfiles(celfiles)

Obtain scan date from raw data used for batch information

# As the scan date and scan time are usually joined by "T" or a white space, use both pattern to split the date with time
pheno$ScanDate_Group <- sapply(strsplit(as.character(protocolData(raw.data)$dates), "T| "), function(x) {x[[1]]})
pheno$ScanDate_Group <- as.factor(pheno$ScanDate_Group)
# assign colours to Scan Date for plots
colours=c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F") # first 8 colour names derived from Dark2, and last 12 names from Set3
i=length(levels(pheno$ScanDate_Group))
colour_list <- colours[1:i]
names(colour_list) <- levels(pheno$ScanDate_Group) # colour to corresponding scan date
# assign pheno type data to raw expression data
pData(raw.data) <- pheno
row.names(pData(raw.data)) <- sampleNames(protocolData(raw.data))
pandoc.table(pData(raw.data)[1:5,],split.table = Inf) # show first 5 rows
  GEO_ID Subject Sample Tissue treatment_time treatment_drug Treatment Age ScanDate_Group
GSM109204.cel.gz GSM109204 rep1 GSM109204_rep1 MCF10A-Myc 30min baseline baseline_30min 35 05/23/02
GSM109205.cel.gz GSM109205 rep1 GSM109205_rep1 MCF10A-Myc 30min dex dex_30min 35 05/23/02
GSM109206.cel.gz GSM109206 rep1 GSM109206_rep1 MCF10A-Myc 2hr baseline baseline_2hr 35 05/23/02
GSM109207.cel.gz GSM109207 rep1 GSM109207_rep1 MCF10A-Myc 2hr dex dex_2hr 35 05/23/02
GSM109208.cel.gz GSM109208 rep1 GSM109208_rep1 MCF10A-Myc 4hr baseline baseline_4hr 35 05/23/02
# summary of ScanDate_Group
res=as.data.frame(table(pData(raw.data)$ScanDate_Group))
names(res) <- c("ScanDate_Group","Frequency")
pandoc.table(res,split.table=Inf) # show first 5 rows
ScanDate_Group Frequency
04/13/05 8
05/23/02 7
05/31/02 8
06/03/02 1

Check if the sample names derived from expression data match those in phenotype file

Raw Probe Intensity Boxplots and Density Histograms

The signal intensity distributions of all samples (arrays) are expected to have the similar scale (i.e. the similar positions and widths of the boxes). Use Kolmogorov-Smirnov statistic to detect the arrays that have values deviated from others, which may indicate an experimental problem.

  1. Compute log2 transformed raw probe intensity
raw.data.log2 <- log2(exprs(raw.data))
dim(raw.data.log2)
## [1] 506944     24

Use subsamp function to reduce intensity data with randomly selected 20000 probes

# subset data
subsamp <- function(x) {
  subsample=20000 # if number of probes are >20000, randomly select 20000 probes for plot or compute
  if (nrow(x)>subsample) {
    ss  = sample(nrow(x), subsample)
    Mss = x[ss,,drop=FALSE]
  } else {
    ss  = TRUE
    Mss = x
  }
  Mss
}
Mss_raw <- subsamp(raw.data.log2)
  1. Outlier detection for raw probe intensity

Compute the Kolmogorov-Smirnov statistic Ka between each array’s (i.e. sample) values (i.e. log2 transformed raw probe intensity values) and the pooled, overall distribution of the values. Detect outliers that are deviated from the threshold using outlier_detection_KS function.

# compute KS statistics to detect outliers
outlier_detection_KS = function(exprs) { # matrix (row: probe intensities/RLE values etc., col: array (e.g. sample))
  fx = ecdf(as.vector(exprs)) # get empirical cumulative distribution function of the data
  KS=suppressWarnings(apply(exprs, 2, function(v)ks.test(v, y = fx, alternative="two.sided")$statistic))
  stats = stats::fivenum(KS, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
  iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
  coef = 1.5
  th = (stats[4] + coef * iqr)
  list(threshold = th, stats=KS, outlier = which(KS > th))
}
outlier_raw = names(outlier_detection_KS(Mss_raw)$outlier)
  1. Boxplot for raw probe intensities

Use boxplot_func function to plot raw probe intensities. Outliers marked with * have different raw proble intensity distributions from others

# boxplot function
boxplot_func <- function(Mss,outlier,ylab) {
  # use * to mark the outliers in boxplot
  array_name <- colnames(Mss)
  array_name[array_name%in%outlier] <- paste0("*",outlier)
  # boxplot RLE by array
  ylim = quantile(Mss, probs = c(0.01, 0.99), na.rm=TRUE) # create range of y-axsis
  # create data frame for plot
  df <- data.frame(
    sample_id=rep(colnames(Mss),each=nrow(Mss)),
    values=as.numeric(Mss),
    scandate=rep(pData(raw.data)$ScanDate_Group,each=nrow(Mss)) # for color
  )
  cols <- colour_list

  ggplot(df, aes(sample_id,values,fill=scandate)) + geom_boxplot(outlier.colour=NA) +
    coord_flip() + theme_bw() +
    ylim(ylim) +
    scale_x_discrete(labels=array_name) +
    ylab(ylab) +
    scale_fill_manual("Scan Date",values=cols) +
    theme(axis.title.y=element_blank())
}
boxplot_func(Mss_raw,outlier_raw,"Raw Probe Intensities")

  1. Density smoothed histograms of log2 transformed raw probe intensities.

This step requires manual inspection. The intensity curves of all samples (arrays) are expected to have the similar shapes and ranges. Samples with deviated curves are likely to have problematic experiments. Various features of the distributions can be indicative of quality related phenomena. For instance, high levels of background will shift an array’s distribution to the right. Lack of signal diminishes its right right tail. A bulge at the upper end of the intensity range often indicates signal saturation.

# create data frame for plot
df <- data.frame(
  sample_id=rep(colnames(Mss_raw),each=nrow(Mss_raw)),
  values=as.numeric(Mss_raw),
  scandate=rep(pData(raw.data)$ScanDate_Group,each=nrow(Mss_raw)) # for color
)
cols <- colour_list

ggplot(df,aes(x=values,colour=scandate)) + geom_line(aes(group=sample_id),stat="density") +
  theme_bw() +
  xlab("Raw Probe Intensities") +
  ylab("Density") +
  scale_color_manual("Scan Date",values=cols)

rm(Mss_raw,df)

Distribution of Perfect Match (PM) and Mismatch (MM)

This step is Affymetrix microarray-specific QC analysis. There are two paired probe types: perfect match (PM) and of mismatched (MM) probes. A PM probe matches a strand of cDNA, while the corresponding MM probe differs from the PM by a change in the central nucleotide.

A probe set is called present if the intensity value of PM is significantly larger than MM.

The Affymetrix approach is under attack because between 15% - 30% of the MM are greater than the PM. For some newer arrays, MM probes are not used. If the number of PMs is not equal to that of MMs, this might be a PM-only array.

If this study has both PM and MM, a plot will be generated that shows the density distributions of the log2-transformed intensities grouped by the matching type of the probes. We expect that MM probes have poorer hybridization than PM probes, and thus that the PM curve is to the right of the MM curve.

if (class(raw.data)=="GeneFeatureSet") {
  message("GeneFeatureSet does not have MM probes. No plots will be generated.")
} else {
  PM <- log2(pm(raw.data))
  MM <- log2(mm(raw.data))
  if(nrow(PM) != nrow(MM)) {
    message("This might be a PM-only array. No plots will be generated.")
    rm(PM,MM)
  } else {
    subsample=20000
    if(nrow(PM)>subsample) { # randomly select 20000 probe sets
      sel = sample(nrow(PM), subsample)
      sPM = PM[sel, ]
      sMM = MM[sel, ]
    } else {
      sPM = PM
      sMM = MM
    }
    rm(PM,MM) 
  
    df <- data.frame(
      values=c(as.numeric(sPM),as.numeric(sMM)),
      types=c(rep("PM",each=length(as.numeric(sPM))),rep("MM",each=length(as.numeric(sMM))))
    )
    cols=colours[1:2]
    ggplot(df,aes(x=values,colour=types)) + geom_line(aes(group=types),stat="density") +
      theme_bw() +
      xlab("Intensity") +
      ylab("Density") +
      scale_color_manual(values=cols) +
      theme(legend.title=element_blank())
  }
}

rm(sPM,sMM,df)

RNA Digestion

Overall RNA quality can be assessed by RNA degradation plots. In the gene expression array, each gene is represented by a probe set. Each probe set is 11-20 probes (pairs of oligos). This plot shows the average intensity of each probe across all probe sets, ordered from the 5´ to the 3´ end. It is expected that probe intensities are lower at the 5´ end of a probe set when compared to the 3´end as RNA degradation starts from the 5´end of a molecule. RNA which is too degraded will show a very high slope from 5´ to 3´. Thus, the standardized slope of the RNA degradation plot serves as quantitative indicator of the RNA degradation.

This step is an Affymetrix microarry-specific QC step, as the probes in each probe set matrix generated by affy package are ordered from 5’ to 3’, which is not implemented in oligo package.

  1. Read in raw data as an AffyBatch object

Generate raw data object under AffyBatch (affy class) for affymetrix microarray-specific QC analysis.

library(affy) # for Affymetrix microarray-specific QC analysis
raw.data.affy <- read.affybatch(celfiles,compress=T)
  1. Compute mean PM intensity for probes following 5’ to 3’ order

Compute mean PM intensities in each probe position and plot in the next step. Adopt AffyRNAdeg function from affy package but include a step that randomly selects 20000 probe sets.

RNAdegaffy <- function(data){ # input a list of probe set matrix with rows as probe ids and columns as samples
  {
    names <- colnames(data[[1]])
    probe.set.size <- function(x) {
      size <- dim(x)[1]
      return(size)
    }
    max.num <- sapply(data, probe.set.size) # get the number of probes in each probe set
    tab <- (table(max.num)) # summarize the frequencies of probe numbers in probe sets
    ord <- order(-as.numeric(tab)) # order the frequency from large to small
    K <- as.numeric(names(tab))[ord[1]] # K is the number of probes appearing in most probe sets
    data <- data[max.num == K] # select data of probe sets only have K number of probes
  }
  
  subsample=20000
  if (length(data)>subsample) { # randomly select 10000 probe sets
    ss = sample(length(data),subsample)
    data = data[ss,drop=FALSE]
  }

  N <- length(data) # number of probe sets
  n <- dim(data[[1]])[2] # number of samples
  
  # create two matrices: number of samples * number of probes representing a probe set
  mns <- matrix(nrow = n, ncol = K) # create matrix for mean values
  sds <- mns # create matrix for sds values

  get.row <- function(x, i = 1) {return(x[i, ])} # function to get each row (i.e. probe id, i) from one probe set x (i.e. probe list[[x]])
  rowstack <- function(x, i = 1) {return(t(sapply(x, get.row, i)))} # function to combine the rows obtained using get.row (pms across samples by probe sets) to get a table (row: samples column: probe sets) and transpose the table (row: probe sets, column: samples)

  for (i in 1:K) { # get probe id (position) from 1 to K from each probe set
    data.stack <- rowstack(data, i) # get the probe pm values in a specific probe position across all samples from each probe set (rows are samples and columns are probe sets)
    if(dim(data[[1]])[2]==1) data.stack <- t(data.stack)
    mns[, i] <- colMeans(data.stack) # get the mean values at one probe position across all probe sets
    sds[, i] <- apply(data.stack, 2, sd) # get the sd values at one probe position across all probe sets
  }
    
  mns.orig <- mns # store the original mns data matrix
  mn <- mns[, 1] # select values in the first probe position
  mns <- sweep(mns, 1, mn) # adjust for the intensity at the first probe position
  mns <- mns/(sds/sqrt(N)) # adjust for standard error
  lm.stats <- function(x) {
    index <- 0:(length(x) - 1)
    ans <- summary(lm(x ~ index))$coefficients[2, c(1, 4)] # use linear model fit the relationship between intensity and probe position
    return(ans)
  }
  stats <- apply(mns, 1, lm.stats)
  answer <- list(N, names, mns.orig, sds/sqrt(N), stats[1,], stats[2, ])
  names(answer) <- c("N", "sample.names", "means.by.number","ses", "slope", "pvalue")
  return(answer)
}

Obtain a list of probe sets with a matrix of oligos (probes) by samples as an input and compute statistics of the mean PM intensities from 5’ to 3’ probe positions.

PM_list <- affy::pm(raw.data.affy,LIST=T) 
PM_list <- lapply(PM_list,log2)
raw.data.rnadeg <- RNAdegaffy(PM_list)
rm(PM_list)
  1. Plot 5’ to 3’ mean PM intensity
status.cols <- unlist(lapply(pData(raw.data)$ScanDate_Group,function(x)colour_list[x])) # colour list to corresponding scan date list
plotAffyRNAdeg(raw.data.rnadeg,cols=status.cols)
legend("topleft",legend=names(colour_list),fill=colour_list,cex=0.6)

detach("package:affy", unload=TRUE) # detach affy package

MA Plots

MA plots allow pairwise comparison of the log-intensity of each array to a reference array and identification of intensity-dependent biases.

The y-axis of the MA-plot shows the log-ratio intensity of one array to the reference median array, which is called M (minus). M = log2(I1)-log2(I2) (I1: the intensity of the array studied; I2: the median intensity across arrays)

The x-axis indicates the average log-intensity of both arrays, which is called A (add). A = 1/2*(log2(I1)+log2(I2))

It is expected that the probe levels do not differ systematically within a group of replicates, so that the MA-plot is centered on the y-axis (y=0 or M=0) from low to high intensities.

Use the Hoeffding’s statistic Da to detect the outliers.

  1. Compute M-A matrices
MA_cal <- function(x) { # matrix (row: probe intensities, col: array (samples)
  medArray = rowMedians(x, na.rm=TRUE)
  M =  x - medArray
  A = (x + medArray)/2
  subsample=20000
  if(nrow(M)>subsample) {
    sel = sample(nrow(M), subsample)
    sM = M[sel, ]
    sA = A[sel, ]
  } else {
    sM = M
    sA = A
  }
  list(M=sM,A=sA) # return a list with M and A data matrices
}
sMA <- MA_cal(raw.data.log2)
  1. Outlier detection for MA

Compute the Hoeffding’s statistic Da (hoeffd function from Hmisc package) on the joint distribution of A and M for each array. Detect outliers that have a Da value greater than 0.15, the default threshold.

outlier_detection_MA = function(exprs) { # list with M and A data matrices
  M=exprs$M
  A=exprs$A
  Dstats = sapply(1:ncol(M), function(x){hoeffd(A[,x], M[,x])$D[1,2]})
  names(Dstats) <- colnames(M)
  Dthresh = 0.15
  list(threshold=Dthresh, stats=Dstats, outlier = which(Dstats > Dthresh))
}
outlier_res_MA <- outlier_detection_MA(sMA)
outlier_MA <- names(outlier_res_MA$outlier)
  1. MA plots

The MA plots show samples with the first 4 highest and lowest values of Da. The value of Da for each sample is shown in the panel headings. Outliers marked with * have Da values >0.15.

# select arrays with top 4 highest and lowest Da
stats_order <- order(outlier_res_MA$stats)
column_sel <- stats_order[c(1:4,(ncol(sMA$M)-3):ncol(sMA$M))]
stats_sel <- round(outlier_res_MA$stats[column_sel],2)
scandate_sel <- pData(raw.data)$ScanDate_Group[column_sel]
M_sel <- sMA$M[,column_sel]
A_sel <- sMA$A[,column_sel]
# use * to mark the outliers
array_name <- colnames(M_sel)
array_name[array_name%in%outlier_MA] <- paste0("*",array_name[array_name%in%outlier_MA])
array_name <- paste0(array_name," (D=",stats_sel,")") # add D statistics to corresponding samples
# create data frame for plot
df <- data.frame(
  sample_id=rep(array_name,each=nrow(M_sel)),
  scandate=rep(scandate_sel,each=nrow(M_sel)),
  M=as.numeric(M_sel),
  A=as.numeric(A_sel)
)
# MA plots
ggplot(df,aes(x=A,y=M,color=scandate)) + geom_point(alpha=0.1) + theme_bw() +
  scale_color_manual("Scan Date",values=colour_list) +
  facet_wrap(~sample_id,ncol=2)

rm(sMA,df)

Spatial Distribution

Spatial images are artificial visualizations of an array that are created to detect spatial trends or biases on an array. Upon scanning, a scale factor is applied to each array of a dataset in order to equalize the arrays mean intensities. A dataset of good quality should have comparable scale factors with limited variation.

Normally, when the features are distributed randomly on the arrays, one expects to see a uniform distribution; control features with particularly high or low intensities may stand out. The rank scale is applied for plotting as it has the potential to amplify patterns that are small in amplitudebut systematic within an array.

This step is an Affymetrix microarry-specific QC step, as the AffyBatch object from affy package contains information of spatial x- and y-coordinate that is not implemented in oligo package.

  1. Compute spatial x- and y-coordinate using raw data object classed as AffyBatch
library(affy)
affy_spatial <- function(affybatch) {
  maxc = ncol(affybatch) # number of probes in x-coordinate
  maxr = nrow(affybatch) # number of probes in y-coordinate
  sx = rep(seq_len(maxc), each = maxr) ## spatial x-coordinate
  sy = rep(seq_len(maxr), maxc) ## spatial y-coordinate
  M = log2(affy::exprs(affybatch))
  numArrays = dim(M)[2]
  list(M=M,numArrays=numArrays,sx=sx,sy=sy)
}
raw.data.spatial <- affy_spatial(raw.data.affy)
detach("package:affy", unload=TRUE) # detach affy package
  1. Outlier detection

Compute the sum of the absolutes value of low frequency Fourier coefficients (Fa) across all the probes as a measure of large scale spatial structures. Detect outliers that have a Fa greater than the threshold.

outlier_detection_spatial <- function(affy_spatial_list) {
  sx=affy_spatial_list$sx # spatial x-coordinate
  sy=affy_spatial_list$sy # spatial y-coordinate
  M=affy_spatial_list$M
  numArrays=affy_spatial_list$numArrays
  maxx = max(sx, na.rm=TRUE)
  maxy = max(sy, na.rm=TRUE)
  stat_spatial = numeric(numArrays)
  for(a in seq_len(numArrays)) {
    mat = matrix(NA_real_, nrow=maxy, ncol=maxx)
    mat[cbind(sy, sx) ] = M[, a]
    pg  = fft(mat) ## periodogram, computes the discrete fourier transform
    npg = Re(pg*Conj(pg))
    npg[1,1] = 0 ## drop the constant component
    stat_spatial[a] = sqrt(sum(npg[1:4, 1:4]) / sum(npg)) # low frequency power
  }
  names(stat_spatial)=colnames(M)
  stats = stats::fivenum(stat_spatial, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
  iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
  coef = 1.5
  th = (stats[4] + coef * iqr)
  list(threshold = th, stats=stat_spatial, outlier = which(stat_spatial > th))
}
outlier_res_spatial=outlier_detection_spatial(raw.data.spatial)
outlier_spatial=names(outlier_res_spatial$outlier)
  1. Spatial distribution plots

The spatial distribution plots show samples with the first 4 highest and lowest values of Fa. The value of Fa for each sample is shown in the panel headings. Outliers marked with * have Fa values of large scale spatial structures.

# select arrays with top 4 highest and lowest Da
stats_order <- order(outlier_res_spatial$stats)
column_sel <- stats_order[c(1:4,(ncol(raw.data.spatial$M)-3):ncol(raw.data.spatial$M))]
stats_sel <- round(outlier_res_spatial$stats[column_sel],2)
M_sel <- raw.data.spatial$M[,column_sel]
# apply rank to expression data
M_sel = apply(M_sel, 2, rank)
# use * to mark the outliers
array_name <- colnames(M_sel)
array_name[array_name%in%outlier_spatial] <- paste0("*",array_name[array_name%in%outlier_spatial])
array_name <- paste0(array_name," (F=",stats_sel,")") # add F statistics to corresponding samples
# create variables for plot
df <- data.frame(
  sample_id=rep(array_name,each=nrow(M_sel)),
  M=as.numeric(M_sel),
  row=rep(raw.data.spatial$sy,ncol(M_sel)),
  column=rep(raw.data.spatial$sx,ncol(M_sel))
)
# spatial distribution plots
ggplot(df,aes(x=row,y=column,fill=M)) + geom_tile() + 
  theme_bw() +
  xlab("Raw Probe Intensiry in X") + ylab("Raw Probe Intensiry in Y") +
  scale_fill_gradientn(name="Ranked Intensity",colours=viridis(256,option="B")) +
  facet_wrap(~sample_id,ncol=2)

rm(raw.data.spatial,df)

Relative Log Expression (RLE) Outlier Detection and Plots

The RLE values are computed by calculating for each probe-set the ratio between the expression of a probe-set and the median expression of this probe-set across all arrays of the experiment.

It is assumed that most probe-sets are not changed across the arrays, so it is expected that these ratios are around 0 on a log scale.

The boxplots presenting the distribution of these log-ratios should then be centered near 0 and have similar spread. Other behavior would be a sign of low quality.

  1. Compute RLE

First, fit probe level models for quality assessment using fitProbeLevelModel from Oligo package. Then, use RLE and NUSE functions to compute the RLE and NUSE values respectively.

raw.data2 <- raw.data # As fitPLM needs ExpressionFeatureSet object as an input, assign log transformed expression values to a new object "raw.data2"
exprs(raw.data2) <- raw.data.log2
fitPLM <- fitProbeLevelModel(raw.data)
fitPLM
rm(raw.data2)
M_RLE <- RLE(fitPLM, type="values") # generate RLE matrix
Mss_RLE <- subsamp(M_RLE) # use subsamp function to reduce RLE data with randomly selected 20000 probes
  1. Outlier detection for RLE

Compute the Kolmogorov-Smirnov statistic Ra between each array’s (i.e. sample) values (i.e. relative log expression values) and the pooled, overall distribution of the values. Detect outliers that are deviated from the threshold.

outlier_RLE = names(outlier_detection_KS(Mss_RLE)$outlier) # compute KS statistics to detect outliers
  1. Boxplot for RLE

Use boxplot_func function to plot RLE. Outliers marked with * have values centered away from 0 and/or are more spread out are potentially problematic.

boxplot_func(Mss_RLE,outlier_RLE,"RLE")

rm(Mss_RLE)

Normalized Unscaled Standard Error (NUSE) Outlier Detection and Plots

The NUSE plot represents normalized standard error estimates from the Probe-Level Model (PLM) fit which computes expression measures on a probe set by probe set basis.

The standard error estimates are normalized so that the median standard error for each probe set across all arrays is equal to 1. A box plot of NUSE values is drawn for each array in a dataset and allows checking whether all distributions are centered near 1 and whether an array shows a globally higher spread of the NUSE distribution than the other arrays. Typically, a box centered above 1.1 is indicative of an array with quality problems

  1. Compute NUSE
M_NUSE <- NUSE(fitPLM, type="values") # generate NUSE matrix
Mss_NUSE <- subsamp(M_NUSE) # use subsamp function to reduce RLE data with randomly selected 20000 probes
  1. Outlier detection for NUSE

Compute 75% quantile Na of each array’s NUSE values Detect outliers that have larger Na deviated from the threshold.

# compute upper 75% quantile as statistics to detect outliers
outlier_detection_upperquartile = function(exprs) { # matrix (row: NUSE values, col: array (e.g. sample))
  upperquartile = apply(exprs, 2, quantile, na.rm=TRUE, probs=0.75)
  stats = stats::fivenum(upperquartile, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
  iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
  coef = 1.5
  th = (stats[4] + coef * iqr)
  list(threshold = th, outlier = which(upperquartile > th))
}
outlier_NUSE = names(outlier_detection_upperquartile(Mss_NUSE)$outlier) # compute upper 75% quantile statistics to detect outliers
  1. Boxplot for NUSE

Use boxplot_func function to plot RLE. Outliers marked with * have values centered away from 0 and/or are more spread out are potentially problematic.

boxplot_func(Mss_NUSE,outlier_NUSE,"NUSE")

rm(Mss_NUSE)

Distance between Samples and Outlier Detection

Correlation between arrays was evaluated by the performance of hierarchical clustering of arrays. Hierarchical clustering analysis is performed in two steps: first, the distances between all pairs of arrays are computed and second, a tree is created from these distances by repeatedly grouping those (groups of) arrays together that are closest to each other.

The distance d(ab) between two arrays a and b is computed as the mean absolute difference (L1-distance) between the data of the arrays (using the data from all probes without filtering). In formula, d(ab) = mean | M(ai) - M(bi) |, where M(ai) is the value of the i-th probe on the a-th array.

Outlier detection was computed using the sum of the distances to all other arrays, of which samples who have exceptionally large S(a) = Sum(b)d(ab) are outliers.

  1. Compute distance between samples

Distance estimation using dist2 function derived from genefilter R package

# dist2 estimation
dist2 = function (x,
                  fun = function(a, b) mean(abs(a - b), na.rm = TRUE),
                  diagonal = 0) {

  if (!(is.numeric(diagonal) && (length(diagonal) == 1)))
    stop("'diagonal' must be a numeric scalar.")

  if (missing(fun)) {
    res = apply(x, 2, function(w) colMeans(abs(x-w), na.rm=TRUE))
  } else {
    res = matrix(diagonal, ncol = ncol(x), nrow = ncol(x))
    if (ncol(x) >= 2) {
      for (j in 2:ncol(x))
        for (i in 1:(j - 1))
          res[i, j] = res[j, i] = fun(x[, i], x[, j])
    } # if
  } # else
  colnames(res) = rownames(res) = colnames(x)
  return(res)
}
m <- dist2(raw.data.log2)
dend = as.dendrogram(hclust(as.dist(m), method = "single"))
ord = order.dendrogram(dend)
  1. Outlier detection for sample distance
outlier_detection_dist = function(exprs) { # matrix (row: distance to each sample, col: array (e.g. sample))
  sum = colSums(exprs, na.rm=TRUE) # sum the total distance
  stats = stats::fivenum(sum, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
  iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
  coef = 1.5
  th = (stats[4] + coef * iqr)
  list(threshold = th, outlier = which(sum > th))
}
outlier_dist = names(outlier_detection_dist(m)$outlier)
  1. Plot distance between samples
array_name <- colnames(m)
array_name[array_name%in%outlier_dist] <- paste0("*",outlier_dist)
array_name <- gsub(".CEL.gz|.cel.gz","",array_name) # shorten the sample id
status.cols <- unlist(lapply(pData(raw.data)$ScanDate_Group,function(x)colour_list[x])) # colour list to corresponding scan date list
heatmap.2(m,,Rowv=dend,Colv=dend,
  col=viridis(256, option="B"),ColSideColors=status.cols,RowSideColors=status.cols,
  labCol=array_name,labRow=array_name,
  trace="none",
  margins=c(12,20), # (bottom margin, left margin)
  cexRow = 1,cexCol = 1,
  keysize=1.5,key.title=NA,key.xlab="Dist2",key.ylab="Counts")
legend("bottomleft",legend=names(colour_list),fill=colour_list,cex=0.6)

rm(m)

Principal Component Analysis (PCA)

PCA plots the information contained in the dataset in a reduced number of dimensions. Clustering and PCA plots enable the assessment to what extent arrays resemble each other, and whether this corresponds to the known resemblances of the samples.

  1. Compute PCs and variance explained by the first 10 PCs
# obtain original expression data
raw.data.pca <- na.omit(raw.data.log2)
# As scale function divides by the variance, the probe with the expression sd=0 across samples must be removed.
sd <- apply(raw.data.pca,1,sd)
raw.data.pca <- raw.data.pca[!sd==0,]
# compute pcs
pca <- prcomp(t(raw.data.pca), retx = TRUE, center = TRUE, scale = TRUE)
pc <- data.frame(pca$x)
# compute variance explained by each PC
vars <- pca$sdev^2
pcs <- t(pc)
pvars <- vars*100.0/sum(vars) # proportion of variance (%) explained by each PC
cumsum_pvars <- cumsum(pvars) # Cumulative Proportion of Variance (%)
if (nrow(pcs)>10) {nres <- 10} else {nres=nrow(pcs)} # select top 10 PCs if number of PCs >10
res <- data.frame(rownames(pcs),pvars,cumsum_pvars)[1:nres,]
names(res) <- c("PC","Proportion of Variance (%)","Cumulative Proportion of Variance (%)")
pandoc.table(res,split.table = Inf)
PC Proportion of Variance (%) Cumulative Proportion of Variance (%)
PC1 85.75 85.75
PC2 5.7 91.45
PC3 2.186 93.64
PC4 1.211 94.85
PC5 0.979 95.83
PC6 0.64 96.47
PC7 0.4505 96.92
PC8 0.4126 97.33
PC9 0.3629 97.7
PC10 0.3118 98.01
rm(raw.data.pca)
  1. PCA plots

PCA plots by scan date

cols <- colour_list
df <- data.frame(
  PC1=pc$PC1,
  PC2=pc$PC2,
  scandate=pData(raw.data)$ScanDate_Group
)
ggplot(df,aes(PC1,PC2,color=scandate)) + geom_point() +
  theme_bw() +
  scale_color_manual("Scan Date",values=cols)

PCA plots by treatment if more than 2 subjects involving in the same treatment condition

if(("Treatment"%in%colnames(pData(raw.data)))&(min(summary(pData(raw.data)$Treatment))>2)) {
  # Assign colour to treatment status
  i=nlevels(pheno$Treatment)
  colour_treat <- colours[1:i]
  names(colour_treat) <- levels(pData(raw.data)$Treatment) 
  cols <- colour_treat
  df <- data.frame(
    PC1=pc$PC1,
    PC2=pc$PC2,
    group=pData(raw.data)$Treatment
  )
  ggplot(df,aes(PC1,PC2,color=group)) + geom_point() +
    theme_bw() +
    scale_color_manual("Treatment",values=cols)
}

PCA plots by donor if the same donor involving in more than 2 tissue/treatment conditions

pData(raw.data)$Subject<-as.factor(pData(raw.data)$Subject)
if(("Treatment"%in%colnames(pData(raw.data)))&(min(summary(pData(raw.data)$Subject))>2)) {
  # Assign colour to subject status
  i=nlevels(pData(raw.data)$Subject)
  colour_subj <- colours[1:i]
  names(colour_subj) <- levels(pheno$Subject) 
  cols <- colour_subj
  df <- data.frame(
    PC1=pc$PC1,
    PC2=pc$PC2,
    group=pData(raw.data)$Subject
  )
  ggplot(df,aes(PC1,PC2,color=group)) + geom_point() +
    theme_bw() +
    scale_color_manual("Donor",values=cols)
}

rm(pca)

QC Summary

Summary outliers and detection methods in a table

outlier_all <- c(outlier_raw,outlier_MA,outlier_spatial,outlier_RLE,outlier_NUSE,outlier_dist) # all outliers
if (length(outlier_all)==0) {
  message("No outlier was detected")
  outlier_freq <- data.frame() # create an empty data frame
} else {
  outlier_method <- c(rep("Raw Probe Intensity",length(outlier_raw)),
    rep("MA",length(outlier_MA)),
    rep("Spatial Distribution",length(outlier_spatial)),
    rep("RLE",length(outlier_RLE)),
    rep("NUSE",length(outlier_NUSE)),
    rep("Dist2",length(outlier_dist))) # detection methods for the outliers
  outlier_list <- list() # create empty list to store outliers and their corresponding methods
  for (i in 1:length(outlier_all)){outlier_list[[outlier_all[i]]] <- append(outlier_list[[outlier_all[i]]],outlier_method[i])} # assign methods to each outlier
  # summary table
  Frequency <- sapply(outlier_list,length) # times to detect
  Method <- unlist(lapply(names(outlier_list),function(x){paste0(outlier_list[[x]],collapse=", ")}))
  outlier_freq <- data.frame(Frequency,Method)
  pandoc.table(outlier_freq[order(outlier_freq$Frequency),],caption="Outlier Summary",split.table = Inf)
}
Outlier Summary
  Frequency Method
GSM109212.cel.gz 1 MA
GSM109217.cel.gz 1 MA
GSM109210.cel.gz 1 NUSE
GSM109215.cel.gz 3 Spatial Distribution, RLE, NUSE

Prepare Phenotype Data with QC

Create a new column “QC_Pass” in phenotype data. Samples detected as outliers more than twice are assigned to 0 otherwise to 1.

if (nrow(outlier_freq)==0) { # if no outlier was detected
  pData(raw.data)$QC_Pass <- rep(1,nrow(pData(raw.data))) # assign 1 to all the samples 
} else {
  outliers <- row.names(outlier_freq)[which(outlier_freq$Frequency>2)] # remove samples that were detected as an outlier more than twice
  pData(raw.data)$QC_Pass <- rep(NA,nrow(pData(raw.data))) # add column QC_Pass in phnotype data
  pData(raw.data)$QC_Pass[rownames(pData(raw.data))%in%outliers] <- 0 # assign 0 to outliers to be removed
  pData(raw.data)$QC_Pass[!rownames(pData(raw.data))%in%outliers] <- 1 # assign 1 to samples to be kept
}
pandoc.table(pData(raw.data),split.table = Inf)
  GEO_ID Subject Sample Tissue treatment_time treatment_drug Treatment Age ScanDate_Group QC_Pass
GSM109204.cel.gz GSM109204 rep1 GSM109204_rep1 MCF10A-Myc 30min baseline baseline_30min 35 05/23/02 1
GSM109205.cel.gz GSM109205 rep1 GSM109205_rep1 MCF10A-Myc 30min dex dex_30min 35 05/23/02 1
GSM109206.cel.gz GSM109206 rep1 GSM109206_rep1 MCF10A-Myc 2hr baseline baseline_2hr 35 05/23/02 1
GSM109207.cel.gz GSM109207 rep1 GSM109207_rep1 MCF10A-Myc 2hr dex dex_2hr 35 05/23/02 1
GSM109208.cel.gz GSM109208 rep1 GSM109208_rep1 MCF10A-Myc 4hr baseline baseline_4hr 35 05/23/02 1
GSM109209.cel.gz GSM109209 rep1 GSM109209_rep1 MCF10A-Myc 4hr dex dex_4hr 35 05/23/02 1
GSM109210.cel.gz GSM109210 rep1 GSM109210_rep1 MCF10A-Myc 24hr baseline baseline_24hr 35 05/31/02 1
GSM109211.cel.gz GSM109211 rep1 GSM109211_rep1 MCF10A-Myc 24hr dex dex_24hr 35 05/23/02 1
GSM109212.cel.gz GSM109212 rep2 GSM109212_rep2 MCF10A-Myc 30min baseline baseline_30min 35 05/31/02 1
GSM109213.cel.gz GSM109213 rep2 GSM109213_rep2 MCF10A-Myc 30min dex dex_30min 35 05/31/02 1
GSM109214.cel.gz GSM109214 rep2 GSM109214_rep2 MCF10A-Myc 2hr baseline baseline_2hr 35 05/31/02 1
GSM109215.cel.gz GSM109215 rep2 GSM109215_rep2 MCF10A-Myc 2hr dex dex_2hr 35 06/03/02 0
GSM109216.cel.gz GSM109216 rep2 GSM109216_rep2 MCF10A-Myc 4hr baseline baseline_4hr 35 05/31/02 1
GSM109217.cel.gz GSM109217 rep2 GSM109217_rep2 MCF10A-Myc 4hr dex dex_4hr 35 05/31/02 1
GSM109218.cel.gz GSM109218 rep2 GSM109218_rep2 MCF10A-Myc 24hr baseline baseline_24hr 35 05/31/02 1
GSM109219.cel.gz GSM109219 rep2 GSM109219_rep2 MCF10A-Myc 24hr dex dex_24hr 35 05/31/02 1
GSM109220.CEL.gz GSM109220 rep3 GSM109220_rep3 MCF10A-Myc 30min baseline baseline_30min 35 04/13/05 1
GSM109221.CEL.gz GSM109221 rep3 GSM109221_rep3 MCF10A-Myc 30min dex dex_30min 35 04/13/05 1
GSM109222.CEL.gz GSM109222 rep3 GSM109222_rep3 MCF10A-Myc 2hr baseline baseline_2hr 35 04/13/05 1
GSM109223.CEL.gz GSM109223 rep3 GSM109223_rep3 MCF10A-Myc 2hr dex dex_2hr 35 04/13/05 1
GSM109224.CEL.gz GSM109224 rep3 GSM109224_rep3 MCF10A-Myc 4hr baseline baseline_4hr 35 04/13/05 1
GSM109225.CEL.gz GSM109225 rep3 GSM109225_rep3 MCF10A-Myc 4hr dex dex_4hr 35 04/13/05 1
GSM109226.CEL.gz GSM109226 rep3 GSM109226_rep3 MCF10A-Myc 24hr baseline baseline_24hr 35 04/13/05 1
GSM109227.CEL.gz GSM109227 rep3 GSM109227_rep3 MCF10A-Myc 24hr dex dex_24hr 35 04/13/05 1
pData(raw.data)$Filename <- rownames(pData(raw.data)) # add filename in a new column

Save in a new phenotype file

write.table(pData(raw.data),paste0(datadir,"/",geo_id,"_Phenotype_withQC.txt"),col.names=T,row.names=F,sep="\t",quote=F)

Create variables for subsetting phenotype data into separate comparison groups (e.g. asthma vs. healthy, rhinitis vs. healthy). Assign NULL values if the corresponding disease/treatment status is not available. This is a manual step.

disease_con1 <- NULL
disease_con0 <- NULL
treatment_con1 <- c("dex_24hr","dex_2hr","dex_30min","dex_4hr")
treatment_con0 <- c("baseline_24hr","baseline_2hr","baseline_30min","baseline_4hr")

Subset phenotype file by tissue, disease status and treatment (if available). Differential gene expression analysis will be performed in a separate group.

Use subdat function to subset phenotype file into separate files than only contain one tissue type, one treatment/disease status of the disease vs. healthy/treatment vs. baseline comparison.

The phenotype file is named as: geo_id + Tissue + Disease/Treatment status (if available) + treatment vs baseline/disease vs healthy

subdat <- function(pheno) {
  tissues=levels(pheno$Tissue)
  for (t in tissues) {
    pheno_tis <- pheno[pheno$Tissue==t,]
    if (!is.null(disease_con1)&(!is.null(treatment_con1))) { # Data contains both disease status and treatment
      for (tr in levels(as.factor(c(treatment_con1,treatment_con0)))) { # under the same treatment, get disease and healthy samples
        pheno_tr <- pheno_tis[pheno_tis$Treatment==tr,]
        for (i in 1:length(disease_con1)) {
          if (disease_con1[i]%in%pheno_tr$Disease&disease_con0[i]%in%pheno_tr$Disease) { # under the same treatment, if the data contains both disease and healthy
            pheno_dis <- pheno_tr[pheno_tr$Disease%in%c(disease_con1[i],disease_con0[i]),]
            name <- paste0(resdir,"/",geo_id,"_",t,"_",tr,"_",disease_con1[i],"_vs_",disease_con0[i],".txt")
            write.table(pheno_dis,name,col.names=T,row.names=F,sep="\t",quote=F)
          }
        }
      }
      for (dis in levels(as.factor(c(disease_con1,disease_con0)))) { # under the same disease status, get treatment and baseline samples
        pheno_dis <- pheno_tis[pheno_tis$Disease==dis,]
        for (i in 1:length(treatment_con1)) {
          if (treatment_con1[i]%in%pheno_dis$Treatment&treatment_con0[i]%in%pheno_dis$Treatment) { # under the same disease status, if the data contains both treatment and baseline
            pheno_tr <- pheno_dis[pheno_dis$Treatment%in%c(treatment_con1[i],treatment_con0[i]),]
            name <- paste0(resdir,"/",geo_id,"_",t,"_",dis,"_",treatment_con1[i],"_vs_",treatment_con0[i],".txt")
            write.table(pheno_tr,name,col.names=T,row.names=F,sep="\t",quote=F)
          }
        }
      }
    }
    if (!is.null(disease_con1)&(is.null(treatment_con1))) { # Data only contains disease status
      for (i in 1:length(disease_con1)) {
        if (disease_con1[i]%in%pheno_tis$Disease&disease_con0[i]%in%pheno_tis$Disease) { # if the data contains both disease and healthy samples
          pheno_dis <- pheno_tis[pheno_tis$Disease%in%c(disease_con1[i],disease_con0[i]),]
          name <- paste0(resdir,"/",geo_id,"_",t,"_",disease_con1[i],"_vs_",disease_con0[i],".txt")
          write.table(pheno_dis,name,col.names=T,row.names=F,sep="\t",quote=F)
        }
      }
    }
    if (is.null(disease_con1)&(!is.null(treatment_con1))) { # Data only contains treatment status
      for (i in 1:length(treatment_con1)) {
        if (treatment_con1[i]%in%pheno_tis$Treatment&treatment_con0[i]%in%pheno_tis$Treatment) { # if the data contains both treatment and baseline samples
          pheno_tr <- pheno_tis[pheno_tis$Treatment%in%c(treatment_con1[i],treatment_con0[i]),]
          name <- paste0(resdir,"/",geo_id,"_",t,"_",treatment_con1[i],"_vs_",treatment_con0[i],".txt")
          write.table(pheno_tr,name,col.names=T,row.names=F,sep="\t",quote=F)
        }
      }
    }
  }
}
subdat(pData(raw.data))

Show all the generated separate phenotype files

list.files(path=resdir,pattern=paste0("^",geo_id,".*","_vs_",".*","\\.txt"))
## [1] "GSE4917_MCF10A-Myc_dex_24hr_vs_baseline_24hr.txt"  
## [2] "GSE4917_MCF10A-Myc_dex_2hr_vs_baseline_2hr.txt"    
## [3] "GSE4917_MCF10A-Myc_dex_30min_vs_baseline_30min.txt"
## [4] "GSE4917_MCF10A-Myc_dex_4hr_vs_baseline_4hr.txt"

Session information

sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] hgu133acdf_2.18.0   pd.hg.u133a_3.12.0  DBI_0.7            
##  [4] RSQLite_2.0         bindrcpp_0.2        dplyr_0.7.1        
##  [7] pander_0.6.0        devtools_1.13.3     Hmisc_4.0-3        
## [10] Formula_1.2-2       survival_2.41-3     lattice_0.20-33    
## [13] gplots_3.0.1        ggplot2_2.2.1       viridis_0.4.0      
## [16] viridisLite_0.2.0   oligo_1.38.0        Biostrings_2.42.1  
## [19] XVector_0.14.1      IRanges_2.8.2       S4Vectors_0.12.2   
## [22] oligoClasses_1.36.0 GEOquery_2.40.0     Biobase_2.34.0     
## [25] BiocGenerics_0.20.0
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6               bit64_0.9-7               
##  [3] RColorBrewer_1.1-2         httr_1.2.1                
##  [5] rprojroot_1.2              GenomeInfoDb_1.10.3       
##  [7] tools_3.3.1                backports_1.1.0           
##  [9] R6_2.2.2                   affyio_1.44.0             
## [11] rpart_4.1-10               KernSmooth_2.23-15        
## [13] lazyeval_0.2.0             colorspace_1.3-2          
## [15] nnet_7.3-12                withr_2.0.0               
## [17] gridExtra_2.2.1            bit_1.1-12                
## [19] preprocessCore_1.36.0      htmlTable_1.9             
## [21] labeling_0.3               caTools_1.17.1            
## [23] scales_0.4.1               checkmate_1.8.3           
## [25] stringr_1.2.0              digest_0.6.12             
## [27] foreign_0.8-66             rmarkdown_1.6             
## [29] base64enc_0.1-3            pkgconfig_2.0.1           
## [31] htmltools_0.3.6            htmlwidgets_0.9           
## [33] rlang_0.1.1                BiocInstaller_1.24.0      
## [35] bindr_0.1                  gtools_3.5.0              
## [37] acepack_1.4.1              RCurl_1.95-4.8            
## [39] magrittr_1.5               Matrix_1.2-6              
## [41] Rcpp_0.12.11               munsell_0.4.3             
## [43] stringi_1.1.5              yaml_2.1.14               
## [45] SummarizedExperiment_1.4.0 zlibbioc_1.20.0           
## [47] plyr_1.8.4                 grid_3.3.1                
## [49] affxparser_1.46.0          blob_1.1.0                
## [51] gdata_2.18.0               splines_3.3.1             
## [53] knitr_1.16                 GenomicRanges_1.26.4      
## [55] codetools_0.2-14           XML_3.98-1.9              
## [57] glue_1.1.1                 evaluate_0.10.1           
## [59] latticeExtra_0.6-28        data.table_1.10.4         
## [61] foreach_1.4.3              gtable_0.2.0              
## [63] assertthat_0.2.0           ff_2.2-13                 
## [65] tibble_1.3.3               iterators_1.0.8           
## [67] AnnotationDbi_1.36.2       memoise_1.1.0             
## [69] cluster_2.0.4